rm(list = ls())
library(tidyverse)
library(readr)
library(caret)
library(quanteda)
library(readxl)
library(plyr)


# Resolve every relative path below against the folder that holds this script.
# NOTE: rstudioapi::getSourceEditorContext() needs an active RStudio session.
setwd(dirname(rstudioapi::getSourceEditorContext()$path))

# Sibling helper file; presumably defines r_i_text(), which is called further
# down -- confirm against r_i_text.R.
source("r_i_text.R")

# Training set (Excel); its column names are shown for a quick visual check.
training_data <- read_excel("input_data/erc2020_complete.xlsx")
names(training_data)

# The prediction set arrives as two CSV parts. The second part is given an
# empty `title` column before both parts are stacked row-wise.
prediction_data_p1 <- read_csv("input_data/analysisRCN.csv")
prediction_data_p2 <- read_csv("input_data/analysisRCN2020_2021New.csv")
prediction_data_p2[["title"]] <- ""
prediction_data <- rbind(prediction_data_p1, prediction_data_p2)
# Only the abstract and the title are available as predictors in the
# prediction data.

# Domain / programme code selecting the rows to keep; it is also used as the
# prefix of the output file names at the end of the script.
which_panel <- "PE"

# Keep only the rows belonging to the selected panel, then drop the column
# that was used for filtering. `%in%` is used rather than `==` because `==`
# yields NA for a missing Domain / panelProg, and logical subsetting with NA
# inserts an all-NA row for each such record instead of discarding it.
training_data <- training_data[training_data$Domain %in% which_panel, ]
training_data$Domain <- NULL
prediction_data <- prediction_data[prediction_data$panelProg %in% which_panel, ]
prediction_data$panelProg <- NULL
# prediction_data$idproject <- NULL

# From the training set only the two text fields and the Panel label are used.
training_data <- training_data %>%
  dplyr::select(c("Abstract", "Panel", "Title"))

# Rename the usable columns so that the training and prediction sets match.
colnames(prediction_data)[colnames(prediction_data) == "title"] <- "Title"
colnames(prediction_data)[colnames(prediction_data) == "abstract"] <- "Abstract"

##### Stack training and prediction data into a single table
# Tag every row with its origin (0 = training, 1 = prediction) so the stacked
# table can be split apart again once the shared preprocessing is done.
training_data[["predict_set"]] <- 0
prediction_data[["predict_set"]] <- 1

# rbind.fill() stacks the two tables and fills columns that exist on only one
# side with NA.
whole_data <- plyr::rbind.fill(training_data, prediction_data)

# Combine title and abstract into one text field. r_i_text() comes from
# r_i_text.R; presumably it merges Title into Abstract -- TODO confirm there.
whole_data <- r_i_text(whole_data, "Abstract", "Title", 1)
names(whole_data)[names(whole_data) == "Abstract"] <- "all_text"
whole_data[["Title"]] <- NULL

# Text clean-up: carriage returns, newlines, "/" and "-" each become a single
# space, and every "." becomes the marker string "[SEP] [CLS] ".
# Each rule is c(regex pattern, replacement); Reduce() applies the rules to
# the text column one after another, in the order listed.
whole_data$all_text <- Reduce(
  function(text, rule) gsub(rule[1], rule[2], text),
  list(
    c("\r", " "),
    c("\n", " "),
    c("/", " "),
    c("-", " "),
    c("\\.", "[SEP] [CLS] ")
  ),
  whole_data$all_text
)

# Token count of the longest document, as counted by quanteda's ntoken().
max(ntoken(whole_data$all_text))

# Numeric response: characters 3-4 of Panel converted to a number. substr()
# and as.numeric() are vectorised, so the whole column is computed in one
# call; this also works for an empty table, where a `1:nrow()` loop would
# fail. Rows without a Panel value get NA.
whole_data$panel_resp <- as.numeric(substr(whole_data$Panel, 3, 4))
# whole_data$panel_resp <- as.numeric(whole_data$panel_resp>5)
# Shift the label down by one (presumably 1-based panel number -> 0-based
# class index -- confirm against the model code that reads the CSV).
whole_data$panel_resp <- whole_data$panel_resp - 1

# The raw Panel string is no longer needed once the numeric label exists.
whole_data$Panel <- NULL

# Split the stacked table back into its two parts and save each as CSV.

# Training rows: everything except the origin flag and the project id.
training_data <- whole_data[whole_data$predict_set == 0, ]
training_data <- training_data[
  , setdiff(names(training_data), c("predict_set", "idproject")),
  drop = FALSE
]

# Prediction rows: everything except the origin flag and the response column.
prediction_data <- whole_data[whole_data$predict_set == 1, ]
prediction_data <- prediction_data[
  , setdiff(names(prediction_data), c("predict_set", "panel_resp")),
  drop = FALSE
]

# Output file names are prefixed with the selected panel code.
write_csv(
  training_data,
  paste0("input_data/", which_panel, "_training_data_app.csv")
)
write_csv(
  prediction_data,
  paste0("input_data/", which_panel, "_prediction_data_app.csv")
)



